Applied Generative AI for AI Developers
Key metrics: Latency, Throughput, Memory, Cost
Trade-offs: Quick to implement but may introduce accuracy degradation
Key advantage: Better quality-performance trade-off
# PyTorch quantization-aware training (QAT) example
import torch

# Observers/fake-quant modules must be inserted while the model is in
# training mode.
model.train()
model_qat = torch.quantization.prepare_qat(model, inplace=False)

# Run the usual training loop with fake quantization in the forward pass
train_loop(model_qat, train_data)

# Fold the learned quantization parameters in and emit the final
# quantized model.
model_quantized = torch.quantization.convert(model_qat, inplace=False)
# Example: LLaMA 7B from 28GB (FP16) to 4GB (INT4)
Key insight: Not all weights equally impact model performance
Results: 40-60% size reduction while retaining 90-95% of the original model's performance
# Distillation loss example
def distillation_loss(student_logits, teacher_logits, temperature=2.0):
    """Temperature-scaled knowledge-distillation loss.

    KL divergence between the teacher's and student's softened
    (temperature-scaled) softmax distributions, scaled by T^2 so
    gradient magnitudes stay comparable across temperatures
    (Hinton et al., 2015).

    Args:
        student_logits: raw student logits, shape (..., num_classes).
        teacher_logits: raw teacher logits, same shape as student.
        temperature: softening temperature T (> 0); higher values
            produce softer target distributions.

    Returns:
        Scalar tensor: batch-mean KL(teacher || student) * T^2.
    """
    soft_teacher = F.softmax(teacher_logits / temperature, dim=-1)
    # BUG FIX: F.kl_div expects the first argument to be the *log*-
    # probabilities of the scaled student logits. The original applied
    # F.softmax first and then F.log_softmax on top of the probabilities
    # (a double softmax), which distorts the loss — e.g. it was nonzero
    # even when student and teacher logits were identical.
    log_soft_student = F.log_softmax(student_logits / temperature, dim=-1)
    return F.kl_div(
        log_soft_student,
        soft_teacher,
        reduction='batchmean'
    ) * (temperature ** 2)
# Key idea: Smaller models can accurately predict some tokens
Source: "Fast Inference from Transformers via Speculative Decoding" (Leviathan et al., 2023)
Impact: Reduces memory usage by 20-70%
# Conceptual sliding window attention
def sliding_window_attention(query, key, value, window_size=1024):
    """Attend only to the most recent ``window_size`` key/value positions.

    When the sequence fits inside the window, this is plain full
    attention. Conceptual illustration — ``standard_attention`` is
    assumed to be defined elsewhere.
    """
    seq_len = query.shape[1]
    if seq_len > window_size:
        # Drop everything outside the trailing window of keys/values
        key = key[:, -window_size:, :]
        value = value[:, -window_size:, :]
    return standard_attention(query, key, value)
# Results: 30-90% parameter reduction possible
Challenge: Requires specialized hardware/libraries
# Mixture of Experts conceptual example
class SparselyGatedMoE(nn.Module):
    """Sparsely-gated Mixture-of-Experts layer (conceptual).

    Routes each input to the top-k of ``num_experts`` linear experts,
    so only k expert forward passes are computed per token.
    """

    def __init__(self, input_size, output_size,
                 num_experts=8, k=2):
        # BUG FIX: nn.Module.__init__ must run before any submodule is
        # assigned; without it, setting self.experts raises
        # "cannot assign module before Module.__init__() call".
        super().__init__()
        # Initialize experts and router
        self.experts = nn.ModuleList([
            nn.Linear(input_size, output_size)
            for _ in range(num_experts)
        ])
        self.router = nn.Linear(input_size, num_experts)
        self.k = k  # Top-k experts to use per input

    def forward(self, x):
        # Get router scores and select top-k experts
        router_logits = self.router(x)
        k_logits, indices = router_logits.topk(self.k)
        # Only compute selected expert outputs
        # ... (expert dispatch omitted in this conceptual snippet)
# Benefits: Enables inference of models too large for single GPU
Example:
- 175B parameter model
- 16 GPUs with tensor parallelism
- Each GPU handles ~11B parameters
- Coordinated through collective communication
Impact: 2-4x faster attention computation, enables longer contexts
# Using Flash Attention in PyTorch
from flash_attn import flash_attn_func

# Drop-in replacement for the standard attention computation
attn_output = flash_attn_func(
    q,  # query
    k,  # key
    v,  # value
    dropout_p=0.0,
    causal=True,  # causal mask for decoder-only models
)

# With HuggingFace Transformers, request the Flash Attention 2 backend
# at load time:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b",
    attn_implementation="flash_attention_2",
)
# Impact: 2-10x throughput improvement
Key metrics:
- Time-to-first-token (TTFT)
- Time-per-output-token (TPOT)
- Tokens-per-second (TPS)
- Cost per 1M tokens